################################################################################
#' Sample script for analysis in:
#' @article{nkonde2021,
#'   author = {Nkonde, Mutale and 
#'             Rodriguez, Maria Y. and
#'             Cortana, Leonard and
#'             Mukogosi, Joan K. and
#'             King, Shakira and
#'             Serrato, Ray and
#'             Martinez, Natalie and
#'             Drummer, Mary and
#'             Lewis, Ann and
#'             Malik, Momin M.},
#'   year = {2021},
#'   title = {Disinformation creep: 
#'            ADOS and the strategic weaponization of breaking news},
#'   journal = {The Harvard Kennedy School Misinformation Review},
#'   volume = {},
#'   number = {},
#'   doi = {},
#'   url = {}
#'   }
#' 
#' Copyright (c) 2020-2021 Maria Y. Rodriguez
#' This project is licensed under the terms of the MIT license.
#' 
#' Reviewed by Momin M. Malik
################################################################################
library(tidyverse)
library(tidytext)
library(igraph)
library(ggraph)
library(tidygraph)
library(wordcloud)
# library(maps)
# install.packages("devtools")
# devtools::install_github("hadley/emo")
# devtools::install_github("gadenbuie/tweetrmd")
# devtools::install_github("rstudio/webshot2")
# devtools::install_github("dgrtwo/widyr")
library(emo)
library(tweetrmd)
library(webshot2)
library(widyr)

library(stm)
library(stminsights)
library(wordcloud)

# Note: tweets pulled from API (and from scraping, too) are a non-random and 
# incomplete sample. Frequency counts are all relative to this sample.


######################
# Basic descriptives #
######################

# Frequency: time series of tweet counts, using a "3 hours" interval
# NOTE(review): `df` is not created in this script, and ts_plot() is not
# provided by any package attached above (presumably rtweet::ts_plot) --
# confirm both are available in the session before running.
ts_plot(df, "3 hours") +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold")) +
  labs(title = "Time series of tweets", x = NULL, y = NULL)

# Most frequently shared links
# Tweets without an expanded URL are dropped before counting. The five
# highest counts are kept (rows tied at the cutoff are all retained).
df %>%
  filter(!is.na(urls_expanded_url)) %>%
  count(urls_expanded_url, sort = TRUE) %>%
  slice_max(n, n = 5)

# Most retweeted tweet: sort by retweet count (highest first), keep the first
# row, and show the fields that identify the tweet
df %>%
  arrange(desc(retweet_count)) %>%
  slice_head(n = 1) %>%
  select(created_at, screen_name, text, retweet_count, status_id)

# Most liked tweets
# slice_max() both selects the five highest favorite counts and returns them
# in descending order, so no separate arrange() step is needed. Rows tied at
# the cutoff are all retained.
df %>%
  slice_max(favorite_count, n = 5) %>%
  select(created_at, screen_name, text, favorite_count)

# Top emojis
# Extract every emoji from each tweet into a list column, expand to one row
# per emoji, then keep the ten highest counts (ties at the cutoff retained).
df %>%
  mutate(emoji = ji_extract_all(text)) %>%
  unnest(cols = c(emoji)) %>%
  count(emoji, sort = TRUE) %>%
  slice_max(n, n = 10)

# Top hashtags
# Tokenize with the "tweets" tokenizer, keeping the original case, then keep
# tokens starting with "#" other than the exact string "#ADOS".
# NOTE(review): the exclusion is case-sensitive, so "#ados" etc. still count
# -- confirm that is intended.
# NOTE(review): token = "tweets" appears to have been removed from recent
# tidytext releases -- confirm the installed version still supports it.
df %>%
  unnest_tokens(hashtag, text, token = "tweets", to_lower = FALSE) %>%
  filter(str_detect(hashtag, "^#"),
         hashtag != "#ADOS") %>%
  count(hashtag, sort = TRUE) %>%
  slice_max(n, n = 20)

# Top mentions
# Tokenize with the "tweets" tokenizer (case preserved), keep tokens that
# start with "@", and report the ten highest counts (ties at the cutoff kept).
df %>%
  unnest_tokens(mentions, text, token = "tweets", to_lower = FALSE) %>%
  filter(str_detect(mentions, "^@")) %>%
  count(mentions, sort = TRUE) %>%
  slice_max(n, n = 10)

# Top words
# 1. Clean the tweet text: strip HTML entities, URLs, and non-ASCII characters.
# 2. Tokenize with the "tweets" tokenizer.
# 3. Drop stop words (with and without apostrophes), tokens containing no
#    lowercase letter, hashtags, and mentions.
# 4. Count the remaining words, most frequent first.
words <- df %>%
  mutate(text = str_remove_all(text, "&amp;|&lt;|&gt;")) %>%
  mutate(text = str_remove_all(text, "\\s?(f|ht)(tp)(s?)(://)([^\\.]*)[\\.|/](\\S*)")) %>%
  mutate(text = str_remove_all(text, "[^\x01-\x7F]")) %>%
  unnest_tokens(word, text, token = "tweets") %>%
  filter(!word %in% stop_words$word) %>%
  filter(!word %in% str_remove_all(stop_words$word, "'")) %>%
  filter(str_detect(word, "[a-z]")) %>%
  filter(!str_detect(word, "^#")) %>%
  filter(!str_detect(word, "@\\S+")) %>%
  count(word, sort = TRUE)

# Word cloud of up to 100 words, placed in decreasing order of frequency
wordcloud(
  words = words$word,
  freq = words$n,
  max.words = 100,
  random.order = FALSE,
  colors = "#F29545"
)

# Top tweeters
# Count tweets per account and keep the ten highest counts (ties at the
# cutoff retained). An "@" is prepended to each handle for display.
top <- df %>%
  count(screen_name, sort = TRUE) %>%
  slice_max(n, n = 10) %>%
  mutate(screen_name = paste0("@", screen_name))
top


#################
# User analysis #
#################

# Get friends of top users (or any others of interest)
# `top` is a table (screen_name, n) whose handles carry a display "@" prefix,
# so pass only the handle vector, with the prefix stripped again.
# NOTE(review): get_friends() is presumably rtweet::get_friends(); rtweet is
# not attached in this script -- confirm it is loaded in the session.
fds <- get_friends(str_remove(top$screen_name, "^@"))

# Number of the queried users that follow each friend account
tbl <- table(fds$user_id)

## Subset friends data to only those followed by 3 or more
fds2 <- subset(fds, user_id %in% names(tbl[tbl > 2L]))

# Convert to graph object
# NOTE(review): graph_from_edgelist() needs a two-column matrix; this assumes
# `fds2` holds exactly the follower and friend columns -- confirm.
g <- igraph::graph_from_edgelist(as.matrix(fds2))

# Plot network
plot(g)

#####
# Geographic data

# Top places
# Only tweets with a place name are counted; the ten highest counts are kept
# (rows tied at the cutoff are all retained).
df %>%
  filter(!is.na(place_full_name)) %>%
  count(place_full_name, sort = TRUE) %>%
  slice_max(n, n = 10)

# Create lat/lng variables using all available tweet and profile geo-location data
# NOTE(review): lat_lng() is presumably rtweet::lat_lng(); rtweet is not
# attached in this script -- confirm it is loaded in the session.
geo <- lat_lng(df)

# Plot state boundaries with zero margins, remembering the previous graphics
# parameters so they can be restored afterwards
old_par <- par(mar = c(0, 0, 0, 0))
maps::map("state", lwd = .25)

# Plot lat and lng points onto state map
with(geo, points(lng, lat, pch = 20, cex = .75, col = rgb(0, .3, .7, .75)))

# Restore the margins so later plots are not drawn without axes/labels space
par(old_par)


#####################
# Language analysis #
#####################

# Remove links: delete each run of non-whitespace characters starting with
# "http" (this also covers "https"). Matching stops at the next whitespace so
# that any tweet text following a link is kept rather than discarded.
df$stripped_text <- gsub("http\\S*", "", df$text)

# Tokenize the URL-stripped text into one word per row (unnest_tokens()
# removes punctuation and converts to lowercase). Only the word column is
# kept; no tweet identifier is carried along.
df_clean <- df %>%
  dplyr::select(stripped_text) %>%
  unnest_tokens(word, stripped_text)

# Remove stop words (matched explicitly on the shared `word` column)
data("stop_words")
df_words <- df_clean %>%
  anti_join(stop_words, by = "word")

# Plot top words: horizontal bar chart of the 15 highest word counts (ties at
# the cutoff retained), with bars ordered by count
df_words %>%
  count(word, sort = TRUE) %>%
  slice_max(n, n = 15) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(y = "Count",
       x = "Unique words",
       title = "Count of unique words")

# Tokenize the URL-stripped text into bigrams (pairs of consecutive words).
# Texts with fewer than two words produce an NA bigram; those rows are dropped
# so they cannot surface as a spurious "NA"-"NA" pair in the counts or network.
df_paired_words <- df %>%
  dplyr::select(stripped_text) %>%
  unnest_tokens(paired_words, stripped_text, token = "ngrams", n = 2) %>%
  filter(!is.na(paired_words))

# Bigram frequencies, stop words still included
df_paired_words %>%
  count(paired_words, sort = TRUE)

library(tidyr)
# Split each bigram into its two words and drop pairs where either word is a
# stop word
df_separated_words <- df_paired_words %>%
  separate(paired_words, c("word1", "word2"), sep = " ") %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word)

# New bigram counts
df_words_counts <- df_separated_words %>%
  count(word1, word2, sort = TRUE)

# Plot word network
# Bigrams seen at least 12 times become edges between their two words; edge
# opacity and width both scale with the bigram count.
ggraph(
  graph_from_data_frame(filter(df_words_counts, n >= 12)),
  layout = "fr"
) +
  geom_edge_link(aes(edge_alpha = n, edge_width = n)) +
  geom_node_point(size = 1.8, color = "darkslategray4") +
  geom_node_text(aes(label = name), size = 2, vjust = 1.3) +
  labs(
    title = "Word Network: Tweets using the hashtag ADOS",
    subtitle = "August 20th - 26th,2020",
    x = "",
    y = ""
  )

# Join sentiment classification to the tweet words
# Words are matched to the "bing" lexicon on the shared `word` column, then
# each word/sentiment pair is counted. count() returns an ungrouped table, so
# no ungroup() is needed here.
df_sentiment <- df_words %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE)

# Ten highest word counts within each sentiment (ties at the cutoff retained),
# shown as one bar chart per sentiment
df_sentiment %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(title = "Sentiment of tweets",
       y = "Contribution to sentiment",
       x = NULL) +
  coord_flip()


##################
# Topic modeling #
##################

# Remove emojis (and any other non-ASCII bytes) from the tweet text.
# iconv() is vectorised, so it is applied to the whole column at once; this
# also avoids sapply() attaching the full tweet text as element names.
df$text2 <- iconv(df$text, "latin1", "ASCII", sub = "")

# Remove retweets (there shouldn't be any but just in case)
# as.logical() accepts both a logical column and "TRUE"/"FALSE" strings.
df <- df %>% dplyr::filter(!as.logical(is_retweet))

# Begin stm: preprocess the emoji-free text, carrying the full tweet table
# along as document metadata
processed <- textProcessor(documents = df$text2, metadata = df)

# Prepare: plot what would be removed at lower thresholds 1, 11, ..., 91
plotRemoved(
  processed[["documents"]],
  lower.thresh = seq(from = 1, to = 100, by = 10)
)

# Build the final corpus using a lower threshold of 10
out <- prepDocuments(
  documents = processed[["documents"]],
  vocab = processed[["vocab"]],
  meta = processed[["meta"]],
  lower.thresh = 10
)

docs <- out[["documents"]]
vocab <- out[["vocab"]]
meta <- out[["meta"]]

# Inspect to make sure preprocessing went ok
head(docs) # How many words are in what position
head(vocab)
head(meta)

# Estimate the topic model with spectral initialisation and K = 0
# (presumably so that stm chooses the number of topics itself -- confirm).
# NOTE(review): no seed is supplied, while the topic number and plot range
# hard-coded further down presumably refer to one particular fit -- confirm
# the results are reproducible.
stm_noK <- stm(
  documents = out$documents,
  vocab = out$vocab,
  K = 0,
  init.type = "Spectral",
  data = out$meta
)

labelTopics(stm_noK)

# Summary plot with 7 FREX words per topic label; ylim restricts the display
# to positions 46-66 of the topic axis
plot(
  stm_noK,
  type = "summary",
  labeltype = "frex",
  n = 7,
  xlim = c(0, .3),
  ylim = c(46, 66)
)

# Show example tweets for a topic, then draw a word cloud of that topic
# Example: topic 17 is of interest
topic17 <- findThoughts(stm_noK, texts = meta$text, topics = 17, n = 2)
print(topic17)
plotQuote(topic17[["docs"]][[1]], main = "Topic 17")
cloud(stm_noK, topic = 17, scale = c(2, 0.5))